# Libraries used: plotly for graphics, pandas for data.
# data
import pandas as pd #import library
import numpy as np
# graph
import plotly.express as px
import plotly.graph_objects as go
# sys
import sys
import os
# --- Locations -------------------------------------------------------------
# sys.path[0] is used as the application root; the CSV is expected in a
# "data" sub-folder underneath it.
AppPath = sys.path[0]
data_folder = f'{AppPath}/data'
file = 'data.csv'

# Echo the resolved paths so a wrong working location is easy to spot.
print(data_folder)
print(AppPath)

# Switch into the data folder so the file can be opened by its bare name.
os.chdir(data_folder)

# --- Load ------------------------------------------------------------------
# The export is '~'-delimited.
df = pd.read_csv(file, sep='~', parse_dates=True)

# --- Dtype conversion ------------------------------------------------------
# Shared pd.to_numeric settings: downcast to the smallest integer type and
# turn unparsable values into NaN instead of raising.
downcast = 'integer'
errors = 'coerce'

# Identifier and flag columns are all converted the same way.
for _int_column in ('id', 'max_id', 'min_id', 'is_end', 'is_warning'):
    df[_int_column] = pd.to_numeric(
        df[_int_column], downcast=downcast, errors=errors
    )

# Duration: cast to int64 where possible, otherwise keep the column unchanged.
df['time_delta_min'] = df['time_delta_min'].astype('int64', errors='ignore')

# Calendar date as a real datetime column.
df['date'] = pd.to_datetime(df['date'])

# Notebook-style display of the frame (no effect when run as a script).
df
df.info()

# --- Quick counts ----------------------------------------------------------
_is_warning_start = df['unique_warning'] == True
_is_warning_end = df['last_end'] == True
print('Number of starts of warnings: ', len(df[_is_warning_start]))
print('Numberof ends of warnings: ', len(df[_is_warning_end]))

# --- Region subset ---------------------------------------------------------
# Keep only the 'Київська' region, starting from 1 March 2022.
_in_region = df['region'] == 'Київська'
_since_march = df['date'] >= pd.to_datetime('3/1/2022')
df_region = df[_in_region & _since_march]

# Globals used by the plot titles below.
region = 'Kyiv'
last_date = df_region['datetime'].max()

# Notebook-style display (no effect when run as a script).
df_region
last_date
# --- Number of warnings per war day, one line per month --------------------
warday = (
    df_region[['month_num', 'month', 'war_day', 'unique_warning']]
    .groupby(by=['month_num', 'month', 'war_day'])
    .sum()
    .reset_index()
)
warday_line = px.line(
    warday,
    x='war_day',
    y="unique_warning",
    color="month",
    title=f'intensity of warnings by days in {region}',
)
warday_line.show()

# --- Total number of warnings per weekday, grouped bars per month ----------
weekday = (
    df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num']]
    .groupby(by=['month_num', 'month', 'weekday_num', 'weekday'])
    .sum()
    .reset_index()
)
weekday_bar = px.bar(
    weekday,
    x="weekday",
    y="unique_warning",
    color="month",
    barmode="group",
    title=f'The absolute number of warnings per weekday in {region}',
)
weekday_bar.show()
# Next, we compute the median number of warnings per weekday.
# --- Median number of warnings per weekday ---------------------------------
# Two-stage aggregation: first total the warnings of every single day, then
# take the median of those daily totals per (month, weekday).
weekday = (
    df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num', 'war_day']]
    .groupby(by=['month_num', 'month', 'weekday_num', 'weekday', 'war_day'])
    .sum()
    .reset_index()
    .groupby(by=['month_num', 'month', 'weekday_num', 'weekday'])
    .median()
    .reset_index()
)
# Notebook-style display (no effect when run as a script).
weekday
weekday_bar = px.bar(
    weekday,
    x="weekday",
    y="unique_warning",
    color="month",
    barmode="group",
    title=f'median number of warnings per weekday in {region}',
)
weekday_bar.show()

# --- Distribution of daily warning counts per weekday and month ------------
# Only the per-day totals are needed here; the box plot does the rest.
weekday = (
    df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num', 'war_day']]
    .groupby(by=['month_num', 'month', 'weekday_num', 'weekday', 'war_day'])
    .sum()
    .reset_index()
)
weekday_boxplot = px.box(
    weekday,
    x='month',
    y="unique_warning",
    color="weekday",
    title=f'distribution of warnings per weekday per months in {region}',
)
weekday_boxplot.show()
# --- Total warning minutes per war day, one line per month -----------------
warday = (
    df_region[['month', 'war_day', 'time_delta_min']]
    .groupby(by=['war_day', 'month'])
    .sum()
    .reset_index()
)
warday_line = px.line(warday, x='war_day', y="time_delta_min", color="month")
warday_line.show()

# --- Median vs. mean duration of a single warning per war day --------------
warday = df_region[['month', 'war_day', 'time_delta_min']].copy()

# Same grouping twice, each result tagged with the statistic it holds so the
# two can be stacked and coloured by 'measurement'.
warday_median = (
    warday.groupby(by=['war_day', 'month']).median().assign(measurement='Median')
)
warday_mean = (
    warday.groupby(by=['war_day', 'month']).mean().assign(measurement='Mean')
)

# Stack both statistics into one long frame.
warday = pd.concat([warday_median, warday_mean]).reset_index()

warday_line_mean = px.line(
    warday,
    x='war_day',
    y="time_delta_min",
    color="measurement",
    title=f'median and mean duration of one warning by days of war in {region}',
)
warday_line_mean.show()
# --- Median duration of a warning per weekday, grouped bars per month ------
weekday = (
    df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num']]
    .groupby(by=['month_num', 'month', 'weekday_num', 'weekday'])
    .median()
    .reset_index()
)
weekday_bar = px.bar(
    weekday,
    x="weekday",
    y="time_delta_min",
    color="month",
    barmode="group",
    title=f'median time of warnings per weekday in {region}',
)
weekday_bar.show()

# --- Distribution of total warning minutes per day -------------------------
weekday = (
    df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']]
    .groupby(by=['month_num', 'month', 'weekday_num', 'weekday', 'war_day'])
    .sum()
    .reset_index()
)
# Notebook-style display (no effect when run as a script).
weekday
weekday_boxplot = px.box(
    weekday,
    x='month',
    y="time_delta_min",
    color="weekday",
    title=f'distribution of time in warnings per weekday per months in {region}',
)
weekday_boxplot.show()

# --- Distribution of the duration of a single warning ----------------------
# No aggregation: every row with a known duration is one observation.
weekday = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']]
weekday = weekday[weekday['time_delta_min'].notna()].sort_values(
    by=['month_num', 'month', 'weekday_num', 'weekday']
)
# Notebook-style display (no effect when run as a script).
weekday
weekday_boxplot = px.box(
    weekday,
    x='month',
    y="time_delta_min",
    color="weekday",
    title=f'distribution of time of 1 warning per weekday per months {region}',
)
weekday_boxplot.show()

# --- Histogram of single-warning durations, one facet row per month --------
month_dist = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']]
month_dist = month_dist[month_dist['time_delta_min'].notna()].sort_values(
    by=['month_num', 'month']
)
hist = px.histogram(
    month_dist,
    x="time_delta_min",
    color="month",
    nbins=100,
    marginal="box",
    facet_row="month",
    title=f'histogram of time of 1 warning per months in {region}',
)
hist.show()
# function of counting the Kaplan Meier
def kaplan_meier(df, duration_column='', event_column=''):
    """Build a Kaplan-Meier survival table from a frame of durations.

    Rows whose duration is not strictly positive (including NaN) are dropped
    before the estimate is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input observations.
    duration_column : str
        Name of the column holding the duration of each observation.
    event_column : str
        Name of the column that is ``True`` when the event was observed for
        that row; any other value is treated as "no event".

    Returns
    -------
    pandas.DataFrame
        One row per distinct duration, preceded by a baseline row at
        duration 0, indexed 0..k, with the columns ``duration``,
        ``n_at_risk`` (rows with duration >= t), ``n_events`` (rows with
        duration == t and the event flag set) and ``survival_probability``
        (running product of ``1 - n_events / n_at_risk``).
    """
    df = df[df[duration_column] > 0]
    durations = df.sort_values(duration_column)[duration_column].unique()
    observed = df[event_column] == True

    columns = ['duration', 'n_at_risk', 'n_events',
               'survival_probability']

    # Collect the rows in a plain list and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and re-creating the table on
    # every iteration made the loop needlessly quadratic.
    rows = [(0, df.shape[0], 0, 1.0)]
    survival = 1.0
    for t in durations:
        # t comes from the data itself, so n_at_risk is always >= 1.
        n_at_risk = int((df[duration_column] >= t).sum())
        n_events = int(((df[duration_column] == t) & observed).sum())
        survival *= 1 - n_events / n_at_risk
        rows.append((t, n_at_risk, n_events, survival))

    return pd.DataFrame(rows, columns=columns)
# --- Kaplan-Meier inputs ---------------------------------------------------
# Whole period: finished warnings with a positive duration.
# .copy() makes the filtered slice an independent frame, so the column
# assignments below do not write into a view of df_region.
df_km = df_region[['time_delta_min', 'last_end']]
df_km = df_km[(df_km['last_end'] == True) & (df_km['time_delta_min'] > 0)].copy()
df_km['survival_probability'] = 1
df_km['event'] = 1
df_km.index = range(0, df_km.shape[0])

# Per-month subsets.
df_km_march = df_region[df_region['month'] == 'March']
df_km_april = df_region[df_region['month'] == 'April']

# Kaplan-Meier tables, one per month.
km_march = kaplan_meier(df_km_march, duration_column='time_delta_min', event_column='last_end')
km_april = kaplan_meier(df_km_april, duration_column='time_delta_min', event_column='last_end')

# --- Plot ------------------------------------------------------------------
# Both curves use line_shape='hv': the survival level holds until the next
# duration and only then drops. The March trace previously used 'vh', which
# draws the drop one step too early and made the two months incomparable.
fig = go.Figure()
fig.add_trace(go.Scatter(x=km_march['duration'],
                         y=km_march['survival_probability'],
                         line_shape='hv', name='March'))
fig.add_trace(go.Scatter(x=km_april['duration'],
                         y=km_april['survival_probability'],
                         line_shape='hv', name='April'))

# Horizontal 0.5 reference line, spanning the longest duration of either
# month so it reaches across both curves.
longest_duration = max(km_march['duration'].max(), km_april['duration'].max())
fig.add_trace(go.Scatter(x=np.array([0, longest_duration]), y=np.array([0.5, 0.5]), name="median",
                         line_shape='linear', line_color='rgb(0,176,246)'))

fig.update_layout(title=f'Kaplan Meiers probability of warnings in {region}',
                  xaxis_title='duration of warnings (mins)',
                  yaxis_title='probaility',)
fig.show()
# --- Daily number of warnings vs. daily total duration ---------------------
df_regr = df_region[['month_num', 'month', 'war_day', 'unique_warning', 'time_delta_min']]

# Per-day totals, computed separately for the count and for the duration.
wardays_warnings = (
    df_regr[['month_num', 'month', 'war_day', 'unique_warning']]
    .groupby(by=['month_num', 'month', 'war_day'])
    .sum()
)
wardays_durations = (
    df_regr[['month_num', 'month', 'war_day', 'time_delta_min']]
    .groupby(by=['month_num', 'month', 'war_day'])
    .sum()
)

# Align the two aggregates on their shared (month_num, month, war_day) index.
df_durations_warnings = wardays_warnings.merge(
    wardays_durations, left_index=True, right_index=True
).reset_index()

# Duplicate the metrics under human-readable names used as axis labels.
df_durations_warnings = df_durations_warnings.assign(**{
    'duration of warnings per day (mins)': df_durations_warnings['time_delta_min'],
    'number of warnings per day': df_durations_warnings['unique_warning'],
})

fig = px.scatter(
    df_durations_warnings,
    x='number of warnings per day',
    y='duration of warnings per day (mins)',
    color='month',
    title=f'scatter of durations and number of warnings in {region}',
)
fig.show()
# Now we can see that the air-warning situation in Kyiv in April is better than it was in March. This is consistent with the news from the north that the Russian army lost the battle of Kyiv and redeployed to the south and east of Ukraine to attack the Joint Forces Operation (OUF) zone. We hope the Russian forces will be defeated there as well.